Objective

The goal of your project is to predict the manner in which the participants performed the exercise. This is the “classe” variable in the training set. You may use any of the other variables to predict with.

In this project, your goal will be to use data from accelerometers on the belt, forearm, arm, and dumbbell of 6 participants. They were asked to perform barbell lifts correctly and incorrectly in 5 different ways. More information is available from the website here (see the section on the Weight Lifting Exercise Dataset).

Data

The data for this project come from the Weight Lifting Exercise Dataset. The training data are provided in pml-training.csv and the test data in pml-testing.csv.

# import libraries
library(plotly)
library(caret)

# import data
# NOTE(review): this dataset encodes missing values not only as "NA" but
# also as empty strings and "#DIV/0!" (spreadsheet artifacts) — confirm
# against the raw CSVs. Mapping them all to NA lets the NA-based column
# cleanup below actually drop the sparse summary columns.
na_marks <- c("NA", "", "#DIV/0!")
train <- read.csv("pml-training.csv", na.strings = na_marks)
test <- read.csv("pml-testing.csv", na.strings = na_marks)

Data Wrangling

First we need to clean up our dataset and only use columns that contain data: remove unwanted columns and handle NAs.

# removing columns that contain any NAs (the sparse summary columns)
train <- train[, colSums(is.na(train)) == 0]

# removing near-zero-variance columns — they carry no predictive signal
NZV <- nearZeroVar(train)
train <- train[, -NZV]

# removing bookkeeping columns (row index, user name, timestamps, window
# ids) so identity/time information does not leak into the model
train <- train[, -c(1:7)]

# examine our objective column (plot-title typo "Propotion" fixed)
plot_ly(x = train$classe, type = "histogram", histnorm = "probability") %>%
  layout(title = "Proportion of Classe variable in Train dataset",
         xaxis = list(title = "Classe"), yaxis = list(title = "Probability"))

Prediction Technique

The two main techniques will be Decision Tree (rpart) & Random Forest modeling.

library(rpart)
library(rpart.plot)

# fix the RNG seed so the train/validation split (and therefore every
# result reported below) is reproducible across runs
set.seed(1234)

# create training (70%) and validation (30%) partitions,
# stratified on the outcome variable
inTrain <- createDataPartition(train$classe, p = 0.7, list = FALSE)
training <- train[inTrain, ]
validation <- train[-inTrain, ]

# classification models expect the outcome as a factor
training$classe <- as.factor(training$classe)
validation$classe <- as.factor(validation$classe)

# build model & plot result
mod1 <- rpart(classe ~ ., data = training, method = "class",
              na.action = na.pass)
rpart.plot(mod1, main = "Model#1: Decision Tree")

# use model#1 to predict on the held-out validation set
pred1 <- predict(mod1, newdata = validation, type = "class")
confusionTree <- confusionMatrix(pred1, validation$classe)
confusionTree
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1472  215   24   72   51
##          B   56  613   96   32  160
##          C   29   80  734  141  162
##          D   93  189  141  679  105
##          E   24   42   31   40  604
## 
## Overall Statistics
##                                           
##                Accuracy : 0.697           
##                  95% CI : (0.6851, 0.7088)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6161          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.8793   0.5382   0.7154   0.7044   0.5582
## Specificity            0.9140   0.9275   0.9152   0.8927   0.9715
## Pos Pred Value         0.8026   0.6405   0.6405   0.5626   0.8151
## Neg Pred Value         0.9501   0.8933   0.9384   0.9391   0.9071
## Prevalence             0.2845   0.1935   0.1743   0.1638   0.1839
## Detection Rate         0.2501   0.1042   0.1247   0.1154   0.1026
## Detection Prevalence   0.3116   0.1626   0.1947   0.2051   0.1259
## Balanced Accuracy      0.8967   0.7329   0.8153   0.7985   0.7649
library(randomForest)

# seed the RNG: random forests are stochastic (bootstrap samples and
# random feature subsets), so an unseeded fit is not reproducible
set.seed(1234)

# build model & plot error rate against number of trees
mod2 <- randomForest(classe ~ ., data = training, ntree = 50, mtry = 5,
                     importance = TRUE)
plot(mod2, log = "y")

# use model#2 to predict on the held-out validation set
pred2 <- predict(mod2, validation)
confusionRF <- confusionMatrix(pred2, validation$classe)
confusionRF
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    A    B    C    D    E
##          A 1671    3    0    0    0
##          B    2 1133    9    0    0
##          C    0    3 1017   18    0
##          D    0    0    0  944    0
##          E    1    0    0    2 1082
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9935          
##                  95% CI : (0.9911, 0.9954)
##     No Information Rate : 0.2845          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9918          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: A Class: B Class: C Class: D Class: E
## Sensitivity            0.9982   0.9947   0.9912   0.9793   1.0000
## Specificity            0.9993   0.9977   0.9957   1.0000   0.9994
## Pos Pred Value         0.9982   0.9904   0.9798   1.0000   0.9972
## Neg Pred Value         0.9993   0.9987   0.9981   0.9960   1.0000
## Prevalence             0.2845   0.1935   0.1743   0.1638   0.1839
## Detection Rate         0.2839   0.1925   0.1728   0.1604   0.1839
## Detection Prevalence   0.2845   0.1944   0.1764   0.1604   0.1844
## Balanced Accuracy      0.9987   0.9962   0.9935   0.9896   0.9997
# apply the final (random-forest) model to the 20 held-out test cases
predict(mod2, newdata=test)
##  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 
##  B  A  B  A  A  E  D  B  A  A  B  C  B  A  E  E  A  B  B  B 
## Levels: A B C D E